data_2021 Cleaning y descripción¶
Decription¶
In [ ]:
# importar librerias necesarias para analisis de datos y visualizacion
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# cargar datos con separacion ;
data_2021 = pd.read_csv("INFE_Spain_csv_2021.csv", sep=";")
data_2021
Out[ ]:
| ID | wght | QD1 | QD2 | QD3 | QD4 | QD5_1 | QD5_2 | QD5_3 | QD5_4 | ... | QD11_4 | QD11_5 | QD11_6 | QD11_7 | QD11_8 | QD11_9 | QD11_10 | QD11_99 | QD12 | QD13 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2594.2485 | 0 | 8 | if equal or above 15000 | Spanish | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Between 15000 and 47000 euro |
| 1 | 2 | 1825.7745 | 1 | 11 | if below 15000 | Spanish | 0 | 1 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Between 15000 and 47000 euro |
| 2 | 3 | 3207.1128 | 1 | 14 | if equal or above 15000 | Spanish | 0 | 1 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Below 15000 euro -bottom 25 |
| 3 | 4 | 3009.3894 | 0 | 8 | if below 15000 | Spanish | 0 | 1 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Above 47000 euro -top 25 |
| 4 | 5 | 5779.9355 | 1 | 9 | if equal or above 15000 | Spanish | 0 | 1 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Between 15000 and 47000 euro |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7759 | 7760 | 4408.7930 | 1 | 4 | if below 15000 | Spanish | 0 | 1 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Between 15000 and 47000 euro |
| 7760 | 7761 | 11130.0900 | 0 | 10 | if equal or above 15000 | Spanish | 0 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Between 15000 and 47000 euro |
| 7761 | 7762 | 1201.2681 | 1 | 11 | if equal or above 15000 | Spanish | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Do not know |
| 7762 | 7763 | 7075.0142 | 0 | 12 | if equal or above 15000 | Spanish | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Refuse to answer |
| 7763 | 7764 | 5978.3618 | 0 | 12 | if below 15000 | Spanish | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Below 15000 euro -bottom 25 |
7764 rows × 194 columns
In [ ]:
# convertir columna id en indice
data_2021 = data_2021.set_index("ID")
data_2021.head()
Out[ ]:
| wght | QD1 | QD2 | QD3 | QD4 | QD5_1 | QD5_2 | QD5_3 | QD5_4 | QD5_5 | ... | QD11_4 | QD11_5 | QD11_6 | QD11_7 | QD11_8 | QD11_9 | QD11_10 | QD11_99 | QD12 | QD13 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | |||||||||||||||||||||
| 1 | 2594.2485 | 0 | 8 | if equal or above 15000 | Spanish | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Between 15000 and 47000 euro |
| 2 | 1825.7745 | 1 | 11 | if below 15000 | Spanish | 0 | 1 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Between 15000 and 47000 euro |
| 3 | 3207.1128 | 1 | 14 | if equal or above 15000 | Spanish | 0 | 1 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Below 15000 euro -bottom 25 |
| 4 | 3009.3894 | 0 | 8 | if below 15000 | Spanish | 0 | 1 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Above 47000 euro -top 25 |
| 5 | 5779.9355 | 1 | 9 | if equal or above 15000 | Spanish | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Between 15000 and 47000 euro |
5 rows × 193 columns
In [ ]:
# Mostrar estadísticas descriptivas de las columnas numéricas
print(data_2021.describe())
# Verificar tipos de datos y valores faltantes
print(data_2021.info())
wght QD1 QD2 QD5_1 QD5_2 \
count 7764.000000 7764.000000 7764.000000 7764.000000 7764.000000
mean 4639.768677 0.518805 8.822257 0.123776 0.621716
std 2990.281480 0.499678 4.787225 0.329348 0.484990
min 420.189910 0.000000 1.000000 0.000000 0.000000
25% 2446.758500 0.000000 5.000000 0.000000 0.000000
50% 3852.094450 1.000000 9.000000 0.000000 1.000000
75% 6402.205600 1.000000 13.000000 0.000000 1.000000
max 19411.416000 1.000000 17.000000 1.000000 1.000000
QD5_3 QD5_4 QD5_5 QD5_6 QD5_7 ... \
count 7764.000000 7764.000000 7764.000000 7764.000000 7764.000000 ...
mean 0.269835 0.209557 0.231195 0.002061 0.004894 ...
std 0.443903 0.407019 0.421624 0.045352 0.069793 ...
min 0.000000 0.000000 0.000000 0.000000 0.000000 ...
25% 0.000000 0.000000 0.000000 0.000000 0.000000 ...
50% 0.000000 0.000000 0.000000 0.000000 0.000000 ...
75% 1.000000 0.000000 0.000000 0.000000 0.000000 ...
max 1.000000 1.000000 1.000000 1.000000 1.000000 ...
QD11_2 QD11_4 QD11_5 QD11_6 QD11_7 \
count 7764.000000 7764.000000 7764.000000 7764.000000 7764.000000
mean 0.003349 0.068650 -0.011077 -0.019964 -0.020350
std 1.161111 1.186923 1.154833 1.150859 1.150684
min -99.000000 -99.000000 -99.000000 -99.000000 -99.000000
25% 0.000000 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000 0.000000 0.000000
75% 0.000000 0.000000 0.000000 0.000000 0.000000
max 1.000000 1.000000 1.000000 1.000000 1.000000
QD11_8 QD11_9 QD11_10 QD11_99 QD12
count 7764.000000 7764.000000 7764.000000 7764.000000 7764.000000
mean -0.020994 0.014297 -0.021638 0.000129 0.878928
std 1.150393 1.165734 1.150101 0.011349 0.326231
min -99.000000 -99.000000 -99.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000 1.000000
50% 0.000000 0.000000 0.000000 0.000000 1.000000
75% 0.000000 0.000000 0.000000 0.000000 1.000000
max 1.000000 1.000000 1.000000 1.000000 1.000000
[8 rows x 189 columns]
<class 'pandas.core.frame.DataFrame'>
Index: 7764 entries, 1 to 7764
Columns: 193 entries, wght to QD13
dtypes: float64(47), int64(142), object(4)
memory usage: 11.5+ MB
None
In [ ]:
# mostrar los tipos de datos de las columnas
data_2021.dtypes
Out[ ]:
wght float64
QD1 int64
QD2 int64
QD3 object
QD4 object
...
QD11_9 int64
QD11_10 int64
QD11_99 int64
QD12 int64
QD13 object
Length: 193, dtype: object
In [ ]:
# cambiar valores de columna QD13 para que sean mas cortos
data_2021["QD13"] = data_2021["QD13"].replace({'Between 15000 and 47000 euro': '15000 - 47000 €',
'Below 15000 euro -bottom 25': 'Below 15000€',
'Above 47000 euro -top 25' : 'Above 47000 €',
'Missing because of interviewer or CAPI error': 'Interviewer or CAPI error'
})
data_2021["QD13"].value_counts()
Out[ ]:
QD13 15000 - 47000 € 4060 Below 15000€ 1712 Above 47000 € 1510 Do not know 240 Refuse to answer 222 Interviewer or CAPI error 20 Name: count, dtype: int64
Distribución de las edades¶
In [ ]:
def classify_age_band(age):
if 18 <= age <= 19:
return 'teenager'
elif 20 <= age <= 29:
return '20s'
elif 30 <= age <= 39:
return '30s'
elif 40 <= age <= 49:
return '40s'
elif 50 <= age <= 59:
return '50s'
elif 60 <= age <= 69:
return '60s'
elif 70 <= age <= 79:
return '70s'
elif age >= 80:
return '80+'
else:
return -99 # Para cualquier edad no contemplada o datos faltantes
# Aplicar la función para crear la nueva columna 'QD7_a'
data_2021['QD7_a'] = data_2021['QD7'].apply(classify_age_band)
# Verificar los resultados
print(data_2021[['QD7', 'QD7_a']].head())
QD7 QD7_a ID 1 56 50s 2 34 30s 3 39 30s 4 40 40s 5 46 40s
In [ ]:
data_2021['QD7'].describe(include='all')
Out[ ]:
count 7764.000000 mean 46.959299 std 15.649587 min 18.000000 25% 35.000000 50% 47.000000 75% 59.000000 max 80.000000 Name: QD7, dtype: float64
In [ ]:
# histograma con la distribución de las edades (columna QD7)
plt.figure(figsize=(10, 6))
sns.histplot(data = data_2021, x="QD7", bins=20, kde=True)
plt.title("Distribución de edades")
plt.xlabel("Edad")
plt.ylabel("Frecuencia")
plt.show()
Missing values¶
In [ ]:
# Generando una tabla con la cantidad de valores missing para cada columna y su porcentje
missing_values = data_2021.isnull().sum()
missing_values_percent = 100 * data_2021.isnull().sum() / len(data_2021)
missing_values_table = pd.concat([missing_values, missing_values_percent], axis=1)
missing_values_table = missing_values_table.rename(columns={0: "Missing Values", 1: "%"})
missing_values_table = missing_values_table[missing_values_table.iloc[:, 1] != 0].sort_values("%", ascending=False).round(1)
print("Hay " + str(missing_values_table.shape[0]) + " columnas con valores faltantes")
missing_values_table
Hay 43 columnas con valores faltantes
Out[ ]:
| Missing Values | % | |
|---|---|---|
| QF3_4 | 7764 | 100.0 |
| QP7_2 | 7764 | 100.0 |
| QP2_17 | 7764 | 100.0 |
| QP3_4 | 7764 | 100.0 |
| QP3_6 | 7764 | 100.0 |
| QP3_8 | 7764 | 100.0 |
| QP3_10 | 7764 | 100.0 |
| QP3_14 | 7764 | 100.0 |
| QP3_15 | 7764 | 100.0 |
| QP3_17 | 7764 | 100.0 |
| QP7_6 | 7764 | 100.0 |
| QP2_14 | 7764 | 100.0 |
| QP10_1 | 7764 | 100.0 |
| QP10_2 | 7764 | 100.0 |
| QP10_4 | 7764 | 100.0 |
| QP10_5 | 7764 | 100.0 |
| QP10_6 | 7764 | 100.0 |
| QP10_7 | 7764 | 100.0 |
| QP10_8 | 7764 | 100.0 |
| QP10_9 | 7764 | 100.0 |
| QP2_15 | 7764 | 100.0 |
| QP2_10 | 7764 | 100.0 |
| QF3_5 | 7764 | 100.0 |
| QF12_5_4 | 7764 | 100.0 |
| QF9_10 | 7764 | 100.0 |
| QF9_11 | 7764 | 100.0 |
| QF12_1_2 | 7764 | 100.0 |
| QF12_2_1 | 7764 | 100.0 |
| QF12_2_3 | 7764 | 100.0 |
| QF12_3_4 | 7764 | 100.0 |
| QF12_3_5 | 7764 | 100.0 |
| QF12_5_2 | 7764 | 100.0 |
| QF12_5_5 | 7764 | 100.0 |
| QP2_6 | 7764 | 100.0 |
| QP1_4 | 7764 | 100.0 |
| QP1_6 | 7764 | 100.0 |
| QP1_8 | 7764 | 100.0 |
| QP1_10 | 7764 | 100.0 |
| QP1_14 | 7764 | 100.0 |
| QP1_15 | 7764 | 100.0 |
| QP1_17 | 7764 | 100.0 |
| QP2_4 | 7764 | 100.0 |
| QP10_10 | 7764 | 100.0 |
In [ ]:
#eliminar todas las columnas con valores nulos ya que el 100% de los datos son nulos
data_2021 = data_2021.dropna(axis=1)
data_2021
Out[ ]:
| wght | QD1 | QD2 | QD3 | QD4 | QD5_1 | QD5_2 | QD5_3 | QD5_4 | QD5_5 | ... | QD11_5 | QD11_6 | QD11_7 | QD11_8 | QD11_9 | QD11_10 | QD11_99 | QD12 | QD13 | QD7_a | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | |||||||||||||||||||||
| 1 | 2594.2485 | 0 | 8 | if equal or above 15000 | Spanish | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 15000 - 47000 € | 50s |
| 2 | 1825.7745 | 1 | 11 | if below 15000 | Spanish | 0 | 1 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 15000 - 47000 € | 30s |
| 3 | 3207.1128 | 1 | 14 | if equal or above 15000 | Spanish | 0 | 1 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Below 15000€ | 30s |
| 4 | 3009.3894 | 0 | 8 | if below 15000 | Spanish | 0 | 1 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Above 47000 € | 40s |
| 5 | 5779.9355 | 1 | 9 | if equal or above 15000 | Spanish | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 15000 - 47000 € | 40s |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7760 | 4408.7930 | 1 | 4 | if below 15000 | Spanish | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 15000 - 47000 € | 50s |
| 7761 | 11130.0900 | 0 | 10 | if equal or above 15000 | Spanish | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 15000 - 47000 € | 60s |
| 7762 | 1201.2681 | 1 | 11 | if equal or above 15000 | Spanish | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Do not know | 30s |
| 7763 | 7075.0142 | 0 | 12 | if equal or above 15000 | Spanish | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Refuse to answer | 60s |
| 7764 | 5978.3618 | 0 | 12 | if below 15000 | Spanish | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Below 15000€ | 50s |
7764 rows × 151 columns
In [ ]:
# mostrar columnas con valores missing
data_2021.columns[data_2021.isnull().sum() > 0]
Out[ ]:
Index([], dtype='object')
Columnas tipo object (encoding?)¶
In [ ]:
# mostrar columnas con valores tipo object
data_2021.dtypes[data_2021.dtypes == "object"]
Out[ ]:
QD3 object QD4 object QP4 object QD13 object QD7_a object dtype: object
In [ ]:
# mostrar valores de la columna QD3
data_2021["QD3"].value_counts()
Out[ ]:
QD3 if equal or above 15000 5361 if below 15000 2403 Name: count, dtype: int64
In [ ]:
# label encoding para la columna QD3
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data_2021['QD3_encoded'] = le.fit_transform(data_2021['QD3'])
# elimiar columna QD3
# data_2021 = data_2021.drop("QD3", axis=1)
data_2021['QD3_encoded'].value_counts()
Out[ ]:
QD3_encoded 1 5361 0 2403 Name: count, dtype: int64
In [ ]:
# mostrar valores columna QD4
print(data_2021["QD4"].value_counts())
QD4 Spanish 7764 Name: count, dtype: int64
In [ ]:
# ELIMINAR COLUMNA QD4
data_2021 = data_2021.drop(columns="QD4")
In [ ]:
# mostrar valores columna QD13
print(data_2021["QD13"].value_counts())
QD13 15000 - 47000 € 4060 Below 15000€ 1712 Above 47000 € 1510 Do not know 240 Refuse to answer 222 Interviewer or CAPI error 20 Name: count, dtype: int64
In [ ]:
data_encoded_2021 = pd.get_dummies(data_2021, columns=['QD13'], prefix='Income')
print(data_encoded_2021.head())
wght QD1 QD2 QD3 QD5_1 QD5_2 QD5_3 QD5_4 \
ID
1 2594.2485 0 8 if equal or above 15000 0 0 0 1
2 1825.7745 1 11 if below 15000 0 1 1 0
3 3207.1128 1 14 if equal or above 15000 0 1 1 0
4 3009.3894 0 8 if below 15000 0 1 1 0
5 5779.9355 1 9 if equal or above 15000 0 1 0 1
QD5_5 QD5_6 ... QD11_99 QD12 QD7_a QD3_encoded \
ID ...
1 0 0 ... 0 1 50s 1
2 0 0 ... 0 1 30s 0
3 0 0 ... 0 0 30s 1
4 0 0 ... 0 1 40s 0
5 0 0 ... 0 0 40s 1
Income_15000 - 47000 € Income_Above 47000 € Income_Below 15000€ \
ID
1 True False False
2 True False False
3 False False True
4 False True False
5 True False False
Income_Do not know Income_Interviewer or CAPI error \
ID
1 False False
2 False False
3 False False
4 False False
5 False False
Income_Refuse to answer
ID
1 False
2 False
3 False
4 False
5 False
[5 rows x 156 columns]
In [ ]:
# Identificar las columnas generadas por pd.get_dummies()
dummy_columns = [col for col in data_encoded_2021 if col.startswith('Income')]
# Convertir sólo esas columnas a uint8 para que salgan como 0 y 1
for column in dummy_columns:
data_encoded_2021[column] = data_encoded_2021[column].astype('uint8')
print(data_encoded_2021[dummy_columns].head())
Income_15000 - 47000 € Income_Above 47000 € Income_Below 15000€ \
ID
1 1 0 0
2 1 0 0
3 0 0 1
4 0 1 0
5 1 0 0
Income_Do not know Income_Interviewer or CAPI error \
ID
1 0 0
2 0 0
3 0 0
4 0 0
5 0 0
Income_Refuse to answer
ID
1 0
2 0
3 0
4 0
5 0
In [ ]:
# mostrar valores columna QP4
print(data_encoded_2021["QP4"].value_counts())
QP4 -98 5930 An unsecured bank loan 377 Insurance 304 A credit card 261 An investment account 236 Crypto-assets 180 Stocks and shares 164 A mortgage 161 A savings account 61 A pension or retirement product 48 0 27 Bonds 8 -97 4 -5 2 -99 1 Name: count, dtype: int64
In [ ]:
# Convertir los códigos a strings para que representen categorías
def label_product(row):
if row == '-99':
return 'Refused_to_answer'
elif row == '-97':
return 'Dont_know'
elif row == '0':
return 'Not_voluntarily_chosen'
elif row == '-98':
return 'Not_applicable'
elif row == '-5':
return 'Error'
else:
return f'{row}'
data_encoded_2021['QP4'] = data_encoded_2021['QP4'].apply(label_product)
print(data_encoded_2021['QP4'].value_counts())
# Aplicar One-Hot Encoding
data_encoded_2021 = pd.get_dummies(data_encoded_2021, columns=['QP4'], prefix='Product')
# Verificamos el resultado
print(data_encoded_2021.filter(regex='Product_').head())
QP4
Not_applicable 5930
An unsecured bank loan 377
Insurance 304
A credit card 261
An investment account 236
Crypto-assets 180
Stocks and shares 164
A mortgage 161
A savings account 61
A pension or retirement product 48
Not_voluntarily_chosen 27
Bonds 8
Dont_know 4
Error 2
Refused_to_answer 1
Name: count, dtype: int64
Product_A credit card Product_A mortgage \
ID
1 False False
2 False True
3 False False
4 False False
5 False False
Product_A pension or retirement product Product_A savings account \
ID
1 False False
2 False False
3 False False
4 False False
5 False False
Product_An investment account Product_An unsecured bank loan \
ID
1 False False
2 False False
3 False False
4 False True
5 False False
Product_Bonds Product_Crypto-assets Product_Dont_know Product_Error \
ID
1 False False False False
2 False False False False
3 False False False False
4 False False False False
5 False False False False
Product_Insurance Product_Not_applicable Product_Not_voluntarily_chosen \
ID
1 False True False
2 False False False
3 False True False
4 False False False
5 True False False
Product_Refused_to_answer Product_Stocks and shares
ID
1 False False
2 False False
3 False False
4 False False
5 False False
In [ ]:
# Identificar las columnas generadas por pd.get_dummies()
dummy_columns = [col for col in data_encoded_2021 if col.startswith('Product')]
# Convertir sólo esas columnas a uint8 para que salgan como 0 y 1
for column in dummy_columns:
data_encoded_2021[column] = data_encoded_2021[column].astype('uint8')
print(data_encoded_2021[dummy_columns].head())
Product_A credit card Product_A mortgage \
ID
1 0 0
2 0 1
3 0 0
4 0 0
5 0 0
Product_A pension or retirement product Product_A savings account \
ID
1 0 0
2 0 0
3 0 0
4 0 0
5 0 0
Product_An investment account Product_An unsecured bank loan \
ID
1 0 0
2 0 0
3 0 0
4 0 1
5 0 0
Product_Bonds Product_Crypto-assets Product_Dont_know Product_Error \
ID
1 0 0 0 0
2 0 0 0 0
3 0 0 0 0
4 0 0 0 0
5 0 0 0 0
Product_Insurance Product_Not_applicable Product_Not_voluntarily_chosen \
ID
1 0 1 0
2 0 0 0
3 0 1 0
4 0 0 0
5 1 0 0
Product_Refused_to_answer Product_Stocks and shares
ID
1 0 0
2 0 0
3 0 0
4 0 0
5 0 0
In [ ]:
#ver las ultimas 30 columnas y todas las filas
data_encoded_2021.iloc[:, -20:]
Out[ ]:
| Income_Above 47000 € | Income_Below 15000€ | Income_Do not know | Income_Interviewer or CAPI error | Income_Refuse to answer | Product_A credit card | Product_A mortgage | Product_A pension or retirement product | Product_A savings account | Product_An investment account | Product_An unsecured bank loan | Product_Bonds | Product_Crypto-assets | Product_Dont_know | Product_Error | Product_Insurance | Product_Not_applicable | Product_Not_voluntarily_chosen | Product_Refused_to_answer | Product_Stocks and shares | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | ||||||||||||||||||||
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7760 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 7761 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 7762 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 7763 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 7764 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
7764 rows × 20 columns
In [ ]:
# eliminar QD3 DE LA TABLA data_encoded_2021
data_encoded_2021 = data_encoded_2021.drop(columns=['QD7_a'])
deteccion de outliers¶
In [ ]:
#deteccion de outliers en todas las columnas
import matplotlib.pyplot as plt
import seaborn as sns
# Gráfico boxplot para la columna 'wght'
plt.figure(figsize=(10, 6))
sns.boxplot(data_encoded_2021['wght'])
plt.title('Boxplot de la columna wght')
plt.show()
# Calculando el IQR para la columna 'wght'
Q1 = data_encoded_2021['wght'].quantile(0.25)
Q3 = data_encoded_2021['wght'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Identificar outliers
outliers = data_encoded_2021[(data_encoded_2021['wght'] < lower_bound) | (data_encoded_2021['wght'] > upper_bound)]
print('Número de outliers detectados:', len(outliers))
print('Límite inferior para outliers:', lower_bound)
print('Límite superior para outliers:', upper_bound)
Número de outliers detectados: 140 Límite inferior para outliers: -3486.41215 Límite superior para outliers: 12335.376250000001
Correlación entre variables de interes¶
In [ ]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
binary_columns = [ 'QF2_1', 'QF2_5', 'QF3_1', 'QF3_2', 'QF3_3', 'QF3_6', 'QF3_7', 'QF3_8',
'QF9_1', 'QF9_2', 'QF9_3', 'QF9_4', 'QF9_5', 'QF9_6', 'QF9_7', 'QF9_8', 'QF9_9', 'QF9_12', 'QF11', 'Income_Below 15000€', 'Income_Above 47000 €', 'Income_15000 - 47000 €']
continuous_columns = ['QD7', 'QD2', 'QD5_3', 'QD5_ch', 'QD9_ISCED', 'QD10', 'QK1']
# Lista de preguntas de interés
# Filtrar las columnas binarias para incluir solo 0 y 1
filtered_data = data_encoded_2021[binary_columns].apply(lambda x: x.isin([0, 1]))
# Crea un DataFrame que incluya solo las filas válidas para las columnas binarias
cleaned_binary_data = data_encoded_2021.loc[filtered_data.all(axis=1), binary_columns]
# Combina los datos binarios filtrados con los datos continuos
combined_data = pd.concat([cleaned_binary_data, data_encoded_2021[continuous_columns]], axis=1)
# Calcula la matriz de correlación
correlation_matrix = combined_data.corr()
# Visualiza la matriz de correlación
plt.figure(figsize=(30, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Matriz de Correlación para Preguntas Seleccionadas')
plt.show()
In [ ]:
import pandas as pd
# Asegúrate de que QD7 y QD9_ISCED están en formato numérico adecuado
data_encoded_2021['QD7'] = pd.to_numeric(data_encoded_2021['QD7'], errors='coerce') # Convierte a numérico, maneja los errores
data_encoded_2021['QD9_ISCED'] = pd.to_numeric(data_encoded_2021['QD9_ISCED'], errors='coerce') # Convierte a numérico, maneja los errores
# Calcula la correlación de Pearson
correlation = data_encoded_2021['QD7'].corr(data_encoded_2021['QD9_ISCED'])
print(f"La correlación entre QD7 (Edad) y QD9_ISCED (Nivel de Educación ISCED) es: {correlation}")
La correlación entre QD7 (Edad) y QD9_ISCED (Nivel de Educación ISCED) es: -0.1828594443351182
BARPLOTS VARIABLES¶
In [ ]:
#BARPLOT QD9_ISCED EN COLORES
data_EDUCATION_LEVEL = data_encoded_2021['QD9_ISCED'].replace({
0: 'Early childhood education',
1: 'Primary education',
2: 'Lower secondary education',
3: 'Upper secondary education',
4: 'Post-secondary non-tertiary education',
5: 'Short-cycle tertiary education',
6: 'Bachelor’s or equivalent level',
7: 'Master’s or equivalent level',
8: 'Doctoral or equivalent level'
})
#BARPLOT DATA_EDUCATION_LEVEL EN COLORES
plt.figure(figsize=(10, 6))
sns.countplot(data = data_EDUCATION_LEVEL, palette= 'viridis')
plt.title('Nivel de educación')
plt.xlabel('Count')
plt.show()
/tmp/ipykernel_2642/1692575795.py:15: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.countplot(data = data_EDUCATION_LEVEL, palette= 'viridis')
In [ ]:
questions = ['QF3_1', 'QF3_2', 'QF3_3', 'QF3_6', 'QF3_7', 'QF3_8']
titulos = ['Saving cash at home or in your wallet',
'Paying money into a savings/deposit account',
'Giving money to family to save on your behalf',
'Investing in crypto-assets or ICOs',
'Investing in stocks and shares',
'Saving or investing in some other way, other than a pension']
# # filtrar por sexo femenino
# feminas = data_encoded_2021[data_encoded_2021['QD1'] == 0]
# Set up the matplotlib figure and axes
fig, axs = plt.subplots(1, len(questions), figsize=(25, 6), sharey=True)
for i, question in enumerate(questions):
# Calculate the percentage of each category within the question
percent_data = data_encoded_2021[question].value_counts(normalize=True).sort_index() * 100
# Create a bar plot for each category's percentage
percent_data.plot(kind='bar', ax=axs[i], color=sns.color_palette('viridis', len(percent_data)))
axs[i].set_title(titulos[i])
axs[i].set_xlabel('Categories')
axs[i].set_ylabel('Percentage')
# Annotate the percentage on the bars
for p in axs[i].patches:
axs[i].annotate(f'{p.get_height():.1f}%', (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', xytext=(0, 10), textcoords='offset points')
# Set a common y-label
fig.text(0.04, 0.5, 'Percentage', va='center', rotation='vertical')
plt.tight_layout()
plt.show()
Ante la pregunta: In the past 12 months have you been [personally] saving money in any of the following ways, whether or not you still have the money? solo el 7% dijo que si a haber invertido en acciones. el 28,5% dijo que guardaba dinero en cash y el 55% dijo que si a pagar dinero a una cuenta de ahorro
In [ ]:
## barplot de la variable QF8 (sin valores -99, -97 y-5) EN PORCENTAJES
data_retirement = data_encoded_2021[data_encoded_2021['QF8'].isin([1, 2, 3, 4, 5, 6])]
percent_data = data_retirement['QF8'].value_counts(normalize=True).sort_index() * 100
percent_data.plot(kind='bar', color=sns.color_palette('viridis', len(percent_data)))
plt.title('rate from 1 to 5 how well they are preparing their retirement (if not retired yet). If retired, how well did they prepare their retirement.')
plt.xlabel('Categories')
plt.ylabel('Percentage')
plt.xticks([0, 1, 2, 3, 4, 5],['very confident', '2', '3', '4', 'not at all confident', 'no retirement plan'], rotation = 45)
for p in plt.gca().patches:
plt.gca().annotate(f'{p.get_height():.1f}%', (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()
In [ ]:
questions = ['QF9_1', 'QF9_2', 'QF9_3', 'QF9_4', 'QF9_5', 'QF9_6', 'QF9_7', 'QF9_8', 'QF9_9', 'QF9_12', 'QF9_97', 'QF9_99']
titles_qf9 = [
'Draw a government pension/old-age benefit',
'Draw an occupational or workplace pension plan',
'Draw a private pension plan',
'Sell financial assets (such as stocks, bonds or mutual funds)',
'Sell your non-financial assets',
'From income generated by your financial/non-financial assets',
'Rely on a spouse/partner income',
'Rely on your children/other family members to support you',
'Draw on your savings',
'Something else',
'Do not know ',
'Refused to answer the entire question'
]
# Configura la figura de matplotlib y los ejes
fig, axs = plt.subplots(2, len(questions)//2 + len(questions)%2, figsize=(30, 10), sharey=True)
axs = axs.flatten() # Aplana el array para facilitar la iteración
for i, question in enumerate(questions):
# Calcula el porcentaje de cada categoría basado en el total de respuestas
total_responses = len(data_encoded_2021[question])
value_counts = data_encoded_2021[question].value_counts().sort_index()
percentages = (value_counts / total_responses) * 100
# Selecciona solo los porcentajes para 0 y 1 para visualizar en el gráfico
display_percentages = percentages.loc[[0, 1]]
# Crea un gráfico de barras para las categorías 0 y 1
sns.barplot(x=display_percentages.index, y=display_percentages.values, ax=axs[i], palette='viridis')
axs[i].set_title(titles_qf9[i])
axs[i].set_xlabel('Categories')
axs[i].set_ylabel('Percentage')
# Anota los porcentajes reales en las barras
for index in display_percentages.index:
height = display_percentages[index]
axs[i].annotate(f'{height:.1f}%', (index, height),
ha='center', va='center', xytext=(0, 10), textcoords='offset points')
# Establece una etiqueta común para el eje Y
fig.text(0.04, 0.5, 'Percentage', va='center', rotation='vertical')
# Elimina los subgráficos vacíos si el número de preguntas no llena exactamente la cuadrícula de subgráficos
for ax in axs[len(questions):]:
fig.delaxes(ax)
plt.tight_layout()
plt.show()
/tmp/ipykernel_2642/150885677.py:31: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=display_percentages.index, y=display_percentages.values, ax=axs[i], palette='viridis') /tmp/ipykernel_2642/150885677.py:31: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=display_percentages.index, y=display_percentages.values, ax=axs[i], palette='viridis') /tmp/ipykernel_2642/150885677.py:31: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=display_percentages.index, y=display_percentages.values, ax=axs[i], palette='viridis') /tmp/ipykernel_2642/150885677.py:31: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=display_percentages.index, y=display_percentages.values, ax=axs[i], palette='viridis') /tmp/ipykernel_2642/150885677.py:31: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=display_percentages.index, y=display_percentages.values, ax=axs[i], palette='viridis') /tmp/ipykernel_2642/150885677.py:31: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=display_percentages.index, y=display_percentages.values, ax=axs[i], palette='viridis') /tmp/ipykernel_2642/150885677.py:31: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=display_percentages.index, y=display_percentages.values, ax=axs[i], palette='viridis') /tmp/ipykernel_2642/150885677.py:31: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=display_percentages.index, y=display_percentages.values, ax=axs[i], palette='viridis') /tmp/ipykernel_2642/150885677.py:31: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=display_percentages.index, y=display_percentages.values, ax=axs[i], palette='viridis') /tmp/ipykernel_2642/150885677.py:31: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=display_percentages.index, y=display_percentages.values, ax=axs[i], palette='viridis') /tmp/ipykernel_2642/150885677.py:31: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=display_percentages.index, y=display_percentages.values, ax=axs[i], palette='viridis') /tmp/ipykernel_2642/150885677.py:31: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=display_percentages.index, y=display_percentages.values, ax=axs[i], palette='viridis')
In [ ]:
# BARPLOT variable QF11 (cambia -99 por "Refused to answer", -98 por "Not applicable", -97 por "Dont know", 0 por "No", 1 por "Yes")
data_income_expenses = data_encoded_2021['QF11'].replace({
-99: 'Refused ',
-5: 'Programming error',
-97: 'Dont know',
0: 'No',
1: 'Yes'
})
percent_data = data_income_expenses.value_counts(normalize=True) * 100
percent_data.plot(kind='bar', color=sns.color_palette('viridis', len(percent_data)))
plt.title('Sometimes people find that their income does not quite cover their living expenses. In the last 12 months, has this happened to you, personally?')
plt.ylabel('Percentage')
plt.xticks(rotation=0)
for p in plt.gca().patches:
plt.gca().annotate(f'{p.get_height():.1f}%', (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()
El 22.6% de los encuestados cree que su income no cubre todos sus living expenses
In [ ]:
# filtra por los que respondieron 1 a QF11
expenses_higher_income = data_2021[data_2021['QF11'] == 1]
print(expenses_higher_income['QF11'].value_counts())
# Análisis demográfico
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(30, 5))
sns.countplot(x= 'QD13', data=expenses_higher_income, ax=axes[0])
sns.countplot(x= 'QD9_ISCED', data=expenses_higher_income, ax=axes[1])
sns.histplot(x= 'QD7', data=expenses_higher_income, ax=axes[2])
sns.countplot(x= 'QD2', data=expenses_higher_income, ax=axes[3])
axes[0].set_title('Distribución de ingresos')
axes[1].set_title('Nivel de Educación')
axes[2].set_title('Distribución de Edades')
axes[3].set_title('Distribución de Regiones')
plt.tight_layout()
plt.show()
QF11 1 1752 Name: count, dtype: int64
In [ ]:
# COuntplot de qd2
plt.figure(figsize=(10, 6))
sns.countplot(x='QD2', data=data_2021, palette='viridis')
plt.title('Distribución de Regiones')
plt.xlabel('Regiones')
plt.ylabel('Count')
plt.show()
/tmp/ipykernel_2642/1584208492.py:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(x='QD2', data=data_2021, palette='viridis')
In [ ]:
data_2021['QD13'].value_counts()
Out[ ]:
QD13 15000 - 47000 € 4060 Below 15000€ 1712 Above 47000 € 1510 Do not know 240 Refuse to answer 222 Interviewer or CAPI error 20 Name: count, dtype: int64
In [ ]:
# ver ultimas 40 columnas de data_encoded_2021
data_encoded_2021.iloc[:, -25:]
Out[ ]:
| QD11_10 | QD11_99 | QD12 | QD3_encoded | Income_15000 - 47000 € | Income_Above 47000 € | Income_Below 15000€ | Income_Do not know | Income_Interviewer or CAPI error | Income_Refuse to answer | ... | Product_An unsecured bank loan | Product_Bonds | Product_Crypto-assets | Product_Dont_know | Product_Error | Product_Insurance | Product_Not_applicable | Product_Not_voluntarily_chosen | Product_Refused_to_answer | Product_Stocks and shares | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | |||||||||||||||||||||
| 1 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 5 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7760 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 7761 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 7762 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 7763 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 7764 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
7764 rows × 25 columns
In [ ]:
# 1
# Calcula el total de respuestas y las respuestas "Sí" por región
total_por_region = data_2021['QD2'].value_counts()
si_por_region = data_2021[data_2021['QF11'] == 1]['QD2'].value_counts()
# Calcula la proporción de respuestas "Sí" en cada región
proporcion_si_por_region = (si_por_region / total_por_region) * 100
# 2
# Calcula el total de respuestas y las respuestas "Sí" por nivel de educación
total_por_educacion = data_2021['QD9_ISCED'].value_counts()
si_por_educacion = data_2021[data_2021['QF11'] == 1]['QD9_ISCED'].value_counts()
# Calcula la proporción de respuestas "Sí" en cada nivel de educación
proporcion_si_por_educacion = (si_por_educacion / total_por_educacion) * 100
# 3
# Calcula el total de respuestas y las respuestas "Sí" por nivel de ingresos
total_por_ingresos = data_2021['QD13'].value_counts()
si_por_ingresos = data_2021[data_2021['QF11'] == 1]['QD13'].value_counts()
# Calcula la proporción de respuestas "Sí" en cada nivel de ingresos
proporcion_si_por_ingresos = (si_por_ingresos / total_por_ingresos) * 100
In [ ]:
# Calcula el total de respuestas y las respuestas "Sí" por municipality size (QD3)
total_por_municipio = data_2021['QD3'].value_counts()
si_por_municipio = data_2021[data_2021['QF11'] == 1]['QD3'].value_counts()
# Calcula la proporción de respuestas "Sí" en cada tamaño de municipio
proporcion_si_por_municipio = (si_por_municipio / total_por_municipio) * 100
In [ ]:
# Configura la figura y los ejes
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(25, 8)) # Ajusta el tamaño según sea necesario
# Gráfico 1: Porcentaje de Respuestas "Sí" a QF11 por Región
sns.barplot(x=proporcion_si_por_region.index, y=proporcion_si_por_region, ax=axes[0], palette= 'muted')
axes[0].set_title('Porcentaje de Respuestas "Sí" a QF11 por Región')
axes[0].set_xlabel('Región')
axes[0].set_ylabel('Porcentaje')
axes[0].set_xticklabels(['Andalucía', 'Aragón', 'Asturias', 'Baleares', 'Canarias', 'Cantabria', 'Castilla y León', 'Castilla-La Mancha', 'Cataluña', 'Valencia', 'Extremadura', 'Galicia', 'Madrid', 'Murcia', 'Navarra', 'País Vasco', 'La Rioja'], rotation=60)
for p in axes[0].patches:
axes[0].annotate(f'{p.get_height():.1f}%', (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', xytext=(0, 10), textcoords='offset points')
# Gráfico 2: Porcentaje de Respuestas "Sí" a QF11 por Nivel de Educación
sns.barplot(x=proporcion_si_por_educacion.index, y=proporcion_si_por_educacion, ax=axes[1], palette= 'muted')
axes[1].set_title('Porcentaje de Respuestas "Sí" a QF11 por Nivel de Educación')
axes[1].set_xlabel('Nivel de Educación ISCED')
axes[1].set_ylabel('Porcentaje')
axes[1].set_xticklabels(['Early childhood education', 'Primary education', 'Lower secondary education', 'Upper secondary education', 'Post-secondary non-tertiary education', 'Short-cycle tertiary education', 'Bachelor’s or equivalent level', 'Master’s or equivalent level', 'Doctoral or equivalent level'], rotation=60)
for p in axes[1].patches:
axes[1].annotate(f'{p.get_height():.1f}%', (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', xytext=(0, 10), textcoords='offset points')
# Gráfico 3: Porcentaje de Respuestas "Sí" a QF11 por Nivel de Ingresos
sns.barplot(x=proporcion_si_por_ingresos.index, y=proporcion_si_por_ingresos, ax=axes[2], palette= 'muted')
axes[2].set_title('Porcentaje de Respuestas "Sí" a QF11 por Nivel de Ingresos')
axes[2].set_xlabel('Nivel de Ingresos')
axes[2].set_ylabel('Porcentaje')
for p in axes[2].patches:
axes[2].annotate(f'{p.get_height():.1f}%', (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.tight_layout()
plt.show()
/tmp/ipykernel_2642/1998170681.py:5: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=proporcion_si_por_region.index, y=proporcion_si_por_region, ax=axes[0], palette= 'muted') /tmp/ipykernel_2642/1998170681.py:9: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. axes[0].set_xticklabels(['Andalucía', 'Aragón', 'Asturias', 'Baleares', 'Canarias', 'Cantabria', 'Castilla y León', 'Castilla-La Mancha', 'Cataluña', 'Valencia', 'Extremadura', 'Galicia', 'Madrid', 'Murcia', 'Navarra', 'País Vasco', 'La Rioja'], rotation=60) /tmp/ipykernel_2642/1998170681.py:14: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=proporcion_si_por_educacion.index, y=proporcion_si_por_educacion, ax=axes[1], palette= 'muted') /tmp/ipykernel_2642/1998170681.py:18: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. axes[1].set_xticklabels(['Early childhood education', 'Primary education', 'Lower secondary education', 'Upper secondary education', 'Post-secondary non-tertiary education', 'Short-cycle tertiary education', 'Bachelor’s or equivalent level', 'Master’s or equivalent level', 'Doctoral or equivalent level'], rotation=60) /tmp/ipykernel_2642/1998170681.py:23: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=proporcion_si_por_ingresos.index, y=proporcion_si_por_ingresos, ax=axes[2], palette= 'muted')
In [ ]:
data_2021[data_2021['QP4'] != '-98'].value_counts('QP4',normalize = True) *100
Out[ ]:
QP4 An unsecured bank loan 20.556161 Insurance 16.575791 A credit card 14.231189 An investment account 12.868048 Crypto-assets 9.814613 Stocks and shares 8.942203 A mortgage 8.778626 A savings account 3.326063 A pension or retirement product 2.617230 0 1.472192 Bonds 0.436205 -97 0.218103 -5 0.109051 -99 0.054526 Name: proportion, dtype: float64
In [ ]:
#filtrar valores QP4. todos menos -98
barplot_qp4 = data_2021[data_2021['QP4'] != '-98'].value_counts('QP4',normalize = True) *100
# VISUALIZAR EN UN BARPLOT
plt.figure(figsize=(20, 6))
sns.barplot(x=barplot_qp4.index, y=barplot_qp4, palette= 'viridis')
plt.title('Most recent product acquired')
plt.xlabel('QP4')
plt.ylabel('Percentage')
for p in plt.gca().patches:
plt.gca().annotate(f'{p.get_height():.1f}%', (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.xticks(rotation = 45)
plt.tight_layout()
plt.show()
/tmp/ipykernel_2642/2697532845.py:6: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=barplot_qp4.index, y=barplot_qp4, palette= 'viridis')
In [ ]:
data_encoded_2021['QP5'].value_counts()
Out[ ]:
QP5 -98 4226 1 1360 3 1080 2 908 4 140 -97 30 -5 14 -99 6 Name: count, dtype: int64
In [ ]:
# filtra por los que respondieron 1 a Qs1_8
expenses_higher_income = data_2021[data_2021['QS1_8'] == 1]
print(expenses_higher_income['QS1_8'].value_counts())
# Análisis demográfico
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(30, 5))
sns.countplot(x= 'QD13', data=expenses_higher_income, ax=axes[0])
sns.countplot(x= 'QD9_ISCED', data=expenses_higher_income, ax=axes[1])
sns.histplot(x= 'QD7', data=expenses_higher_income, ax=axes[2])
sns.countplot(x= 'QD2', data=expenses_higher_income, ax=axes[3])
axes[0].set_title('Distribución de ingresos')
axes[1].set_title('Nivel de Educación')
axes[2].set_title('Distribución de Edades')
axes[3].set_title('Distribución de Regiones')
plt.tight_layout()
plt.show()
QS1_8 1 1387 Name: count, dtype: int64
In [ ]:
# distribucion respuestas pregunta QS1_4
plt.figure(figsize=(10, 6))
sns.countplot(x='QS1_4', data=data_2021, palette='viridis')
plt.title('Distribución de respuestas a QS1_4')
plt.xlabel('QS1_4')
plt.ylabel('Count')
plt.show()
/tmp/ipykernel_2642/4014492808.py:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(x='QS1_4', data=data_2021, palette='viridis')
In [ ]:
# filtra por los que respondieron 1 a Qs1_8
expenses_higher_income = data_2021[data_2021['QS2_5'] == 1]
print(expenses_higher_income['QS2_5'].value_counts())
# Análisis demográfico
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(30, 5))
sns.countplot(x= 'QD13', data=expenses_higher_income, ax=axes[0])
sns.countplot(x= 'QD9_ISCED', data=expenses_higher_income, ax=axes[1])
sns.histplot(x= 'QD7', data=expenses_higher_income, ax=axes[2])
sns.countplot(x= 'QD2', data=expenses_higher_income, ax=axes[3])
axes[0].set_title('Distribución de ingresos')
axes[1].set_title('Nivel de Educación')
axes[2].set_title('Distribución de Edades')
axes[3].set_title('Distribución de Regiones')
plt.tight_layout()
plt.show()
QS2_5 1 6663 Name: count, dtype: int64
In [ ]:
# distribucion respuestas pregunta QS3_9
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))
sns.countplot(x= 'QS3_9', data=data_2021, ax=axes[0])
sns.countplot(x= 'QS3_11', data=data_2021, ax=axes[1])
axes[0].set_title('I am concerned that my money won’t last ')
axes[1].set_title('I tend to live for today and let tomorrow take care of itself')
axes[0].set_xticks([0, 1, 2, 3, 4, 5, 6],['refused', 'don’t know', '1=completely', '2', '3', '4', '5=not at all'], rotation = 45)
axes[1].set_xticks([0, 1, 2, 3, 4, 5, 6],['refused', 'don’t know', '1=completely', '2', '3', '4', '5=not at all'], rotation = 45)
plt.tight_layout()
plt.show()
In [ ]:
#PORCENTAJES QD12 BARPLOT
a = data_encoded_2021['QD12'].value_counts(normalize= True) *100
plt.figure(figsize=(10, 6))
sns.barplot(x=a.index, y=a, palette= 'viridis')
plt.title('Where you born in Spain?')
plt.xticks([0,1],['No', 'Yes'])
for p in plt.gca().patches:
plt.gca().annotate(f'{p.get_height():.1f}%', (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()
/tmp/ipykernel_2642/3033627799.py:4: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=a.index, y=a, palette= 'viridis')
financial knowledge (QK)¶
In [ ]:
# distribucion respuestas pregunta QK1
qk1 = data_encoded_2021['QK1'].value_counts(normalize= True) *100
plt.figure(figsize=(10, 6))
sns.barplot(x=qk1.index, y=qk1, palette= 'viridis')
plt.title('how you would rate your overall knowledge about financial matters compared with other adults in Spain')
plt.xticks([0,1,2,3,4,5,6],['dont know', 'refused', 'very high', 'quite high', 'about average', 'quite low', 'very low'])
for p in plt.gca().patches:
plt.gca().annotate(f'{p.get_height():.1f}%', (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()
/tmp/ipykernel_2642/3491748961.py:4: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=qk1.index, y=qk1, palette= 'viridis')
In [ ]:
# filtra por los que respondieron 1 a Qs1_8
expenses_higher_income = data_2021[data_2021['QK1'] == 2]
print(expenses_higher_income['QK1'].value_counts())
# Análisis demográfico
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(30, 5))
sns.countplot(x= 'QD13', data=expenses_higher_income, ax=axes[0])
sns.countplot(x= 'QD9_ISCED', data=expenses_higher_income, ax=axes[1])
sns.histplot(x= 'QD7', data=expenses_higher_income, ax=axes[2])
sns.countplot(x= 'QD2', data=expenses_higher_income, ax=axes[3])
axes[0].set_title('Distribución de ingresos')
axes[1].set_title('Nivel de Educación')
axes[2].set_title('Distribución de Edades')
axes[3].set_title('Distribución de Regiones')
plt.tight_layout()
plt.show()
QK1 2 598 Name: count, dtype: int64
In [ ]:
# 1
# Calcula el total de respuestas y las respuestas "Sí" por región
total_por_region = data_2021['QD2'].value_counts()
si_por_region = data_2021[data_2021['QK1'] == 5]['QD2'].value_counts()
# Calcula la proporción de respuestas "Sí" en cada región
proporcion_si_por_region = (si_por_region / total_por_region) * 100
# 2
# Calcula el total de respuestas y las respuestas "Sí" por nivel de educación
total_por_educacion = data_2021['QD9_ISCED'].value_counts()
si_por_educacion = data_2021[data_2021['QK1'] == 5]['QD9_ISCED'].value_counts()
# Calcula la proporción de respuestas "Sí" en cada nivel de educación
proporcion_si_por_educacion = (si_por_educacion / total_por_educacion) * 100
# 3
# Calcula el total de respuestas y las respuestas "Sí" por nivel de ingresos
total_por_ingresos = data_2021['QD13'].value_counts()
si_por_ingresos = data_2021[data_2021['QK1'] == 5]['QD13'].value_counts()
# Calcula la proporción de respuestas "Sí" en cada nivel de ingresos
proporcion_si_por_ingresos = (si_por_ingresos / total_por_ingresos) * 100
In [ ]:
# Configura la figura y los ejes
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(25, 8)) # Ajusta el tamaño según sea necesario
# Gráfico 1: Porcentaje de Respuestas "Sí" a QK1 por Región
sns.barplot(x=proporcion_si_por_region.index, y=proporcion_si_por_region, ax=axes[0], palette= 'muted')
axes[0].set_title('Porcentaje de Respuestas "VERY LOW" a QK1 por Región')
axes[0].set_xlabel('Región')
axes[0].set_ylabel('Porcentaje')
axes[0].set_xticklabels(['Andalucía', 'Aragón', 'Asturias', 'Baleares', 'Canarias', 'Cantabria', 'Castilla y León', 'Castilla-La Mancha', 'Cataluña', 'Valencia', 'Extremadura', 'Galicia', 'Madrid', 'Murcia', 'Navarra', 'País Vasco', 'La Rioja'], rotation=60)
for p in axes[0].patches:
axes[0].annotate(f'{p.get_height():.1f}%', (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', xytext=(0, 10), textcoords='offset points')
# Gráfico 2: Porcentaje de Respuestas "Sí" a QF11 por Nivel de Educación
sns.barplot(x=proporcion_si_por_educacion.index, y=proporcion_si_por_educacion, ax=axes[1], palette= 'muted')
axes[1].set_title('Porcentaje de Respuestas "VERY LOW" a QK1 por Nivel de Educación')
axes[1].set_xlabel('Nivel de Educación ISCED')
axes[1].set_ylabel('Porcentaje')
axes[1].set_xticklabels(['Early childhood education', 'Primary education', 'Lower secondary education', 'Upper secondary education', 'Post-secondary non-tertiary education', 'Short-cycle tertiary education', 'Bachelor’s or equivalent level', 'Master’s or equivalent level', 'Doctoral or equivalent level'], rotation=60)
for p in axes[1].patches:
axes[1].annotate(f'{p.get_height():.1f}%', (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', xytext=(0, 10), textcoords='offset points')
# Gráfico 3: Porcentaje de Respuestas "Sí" a QF11 por Nivel de Ingresos
sns.barplot(x=proporcion_si_por_ingresos.index, y=proporcion_si_por_ingresos, ax=axes[2], palette= 'muted')
axes[2].set_title('Porcentaje de Respuestas "VERY LOW" a QK1 por Nivel de Ingresos')
axes[2].set_xlabel('Nivel de Ingresos')
axes[2].set_ylabel('Porcentaje')
for p in axes[2].patches:
axes[2].annotate(f'{p.get_height():.1f}%', (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.tight_layout()
plt.show()
/tmp/ipykernel_2642/4269437081.py:5: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=proporcion_si_por_region.index, y=proporcion_si_por_region, ax=axes[0], palette= 'muted') /tmp/ipykernel_2642/4269437081.py:9: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. axes[0].set_xticklabels(['Andalucía', 'Aragón', 'Asturias', 'Baleares', 'Canarias', 'Cantabria', 'Castilla y León', 'Castilla-La Mancha', 'Cataluña', 'Valencia', 'Extremadura', 'Galicia', 'Madrid', 'Murcia', 'Navarra', 'País Vasco', 'La Rioja'], rotation=60) /tmp/ipykernel_2642/4269437081.py:14: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=proporcion_si_por_educacion.index, y=proporcion_si_por_educacion, ax=axes[1], palette= 'muted') /tmp/ipykernel_2642/4269437081.py:18: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. axes[1].set_xticklabels(['Early childhood education', 'Primary education', 'Lower secondary education', 'Upper secondary education', 'Post-secondary non-tertiary education', 'Short-cycle tertiary education', 'Bachelor’s or equivalent level', 'Master’s or equivalent level', 'Doctoral or equivalent level'], rotation=60) /tmp/ipykernel_2642/4269437081.py:23: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=proporcion_si_por_ingresos.index, y=proporcion_si_por_ingresos, ax=axes[2], palette= 'muted')
In [ ]:
# FILTRAR DATASET POR RESPUESTAS A QK2 DISTINTAS DE 200 (respuesta incorrecta)
data_QK2 = data_2021[data_2021['QK2'] != 200]
#
# Análisis demográfico
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(30, 5))
sns.countplot(x= 'QD13', data=data_QK2, ax=axes[0])
sns.countplot(x= 'QD9_ISCED', data=data_QK2, ax=axes[1])
sns.histplot(x= 'QD7', data=data_QK2, ax=axes[2])
sns.countplot(x= 'QD2', data=data_QK2, ax=axes[3])
axes[0].set_title('Distribución de ingresos')
axes[1].set_title('Nivel de Educación')
axes[2].set_title('Distribución de Edades')
axes[3].set_title('Distribución de Regiones')
for p in axes[0].patches:
axes[0].annotate(f'{p.get_height():.1f}', (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.tight_layout()
plt.show()
nota financial knowledge¶
In [ ]:
# Definir las respuestas correctas y las múltiples respuestas correctas donde sea necesario
respuestas_correctas = {
'QK4': 0, # Asumiendo que '0' es la respuesta correcta para QK4 (sin interés)
'QK5': 102, # Asumiendo que '102' es la respuesta correcta para QK5 (100 principal + 2% de interés)
'QK7_1': True, # Asumiendo que True es la respuesta correcta para QK7_1
'QK7_2': True,
'QK7_3': True
}
# Respuestas múltiples para QK3
data_2021['QK3_puntos'] = data_2021['QK3'].apply(lambda x: 1 if x in [3, 4] else 0)
# Calcular puntos por cada respuesta correcta, excepto QK6
for pregunta, correcta in respuestas_correctas.items():
data_2021[pregunta + '_puntos'] = (data_2021[pregunta] == correcta).astype(int)
# Asignar puntos para QK6 condicionado a QK5 siendo correcto también
data_2021['QK6_puntos'] = ((data_2021['QK6'] == 1) & (data_2021['QK5'] == 102)).astype(int)
# Sumar puntos para obtener la nota final de conocimiento financiero
columnas_puntos = [pregunta + '_puntos' for pregunta in respuestas_correctas] + ['QK3_puntos', 'QK6_puntos']
data_2021['Nota_Financial_Knowledge'] = data_2021[columnas_puntos].sum(axis=1)
# Visualizar el resultado
data_2021[['Nota_Financial_Knowledge'] + columnas_puntos].head()
Out[ ]:
| Nota_Financial_Knowledge | QK4_puntos | QK5_puntos | QK7_1_puntos | QK7_2_puntos | QK7_3_puntos | QK3_puntos | QK6_puntos | |
|---|---|---|---|---|---|---|---|---|
| ID | ||||||||
| 1 | 5 | 1 | 1 | 1 | 1 | 1 | 0 | 0 |
| 2 | 6 | 1 | 1 | 0 | 1 | 1 | 1 | 1 |
| 3 | 5 | 1 | 1 | 1 | 1 | 1 | 0 | 0 |
| 4 | 5 | 1 | 1 | 1 | 1 | 1 | 0 | 0 |
| 5 | 5 | 1 | 1 | 1 | 0 | 1 | 0 | 1 |
In [ ]:
#barplot de la variable Nota_Financial_Knowledge
plt.figure(figsize=(10, 6))
sns.countplot(data = data_2021, x='Nota_Financial_Knowledge', palette= 'viridis')
plt.title('Nota de Conocimiento Financiero')
plt.xlabel('Nota')
plt.ylabel('Count')
plt.show()
print(data_2021['Nota_Financial_Knowledge'].describe())
data_2021['Nota_Financial_Knowledge'].value_counts(normalize=True) * 100
/tmp/ipykernel_2642/3598526257.py:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(data = data_2021, x='Nota_Financial_Knowledge', palette= 'viridis')
count 7764.000000 mean 4.748454 std 1.496861 min 0.000000 25% 4.000000 50% 5.000000 75% 6.000000 max 7.000000 Name: Nota_Financial_Knowledge, dtype: float64
Out[ ]:
Nota_Financial_Knowledge 4 22.630088 5 21.780010 6 19.448738 3 14.863472 7 14.605873 2 5.358063 1 1.184956 0 0.128800 Name: proportion, dtype: float64
In [ ]:
# INGRESOS
total_por_ingresos = data_2021['QD13'].value_counts()
high_knowledge = data_2021[data_2021['Nota_Financial_Knowledge'] > 5]['QD13'].value_counts()
proporcion_por_ingresos = (high_knowledge / total_por_ingresos) * 100
# EDUCACION
high_knowledge1 = data_2021[data_2021['Nota_Financial_Knowledge'] > 5]['QD9_ISCED'].value_counts()
proporcion_por_educacion = (high_knowledge1 / total_por_educacion) * 100
# REGION
total_por_region = data_2021['QD2'].value_counts()
high_knowledge2 = data_2021[data_2021['Nota_Financial_Knowledge'] > 5]['QD2'].value_counts()
proporcion_por_region = (high_knowledge2 / total_por_region) * 100
In [ ]:
# FILTRAR DATASET POR Nota_Financial_Knowledge > 4
high_knowledge3 = data_2021[data_2021['Nota_Financial_Knowledge'] >5]
# Análisis demográfico
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(30, 5))
sns.barplot(x= proporcion_por_ingresos.index, y=proporcion_por_ingresos, ax=axes[0])
sns.barplot(x= proporcion_por_educacion.index, y=proporcion_por_educacion, ax=axes[1])
sns.histplot(x= 'QD7', data=high_knowledge3, ax=axes[3])
sns.barplot(x= proporcion_por_region.index, y=proporcion_por_region, ax=axes[2])
axes[0].set_title('Distribución de ingresos')
axes[0].set_ylabel('Porcentaje')
axes[1].set_title('Nivel de Educación')
axes[1].set_ylabel('Porcentaje')
axes[3].set_title('Distribución de Edades')
axes[2].set_title('Distribución de Regiones')
axes[2].set_ylabel('Porcentaje')
# Anotar los valores en los gráficos
for ax in axes[0:3]: # Suponiendo que los primeros tres son gráficos de barras
for p in ax.patches:
ax.annotate(f'{p.get_height():.1f}%', (p.get_x() + p.get_width() / 2., p.get_height()),
ha='center', va='center', xytext=(0, 10), textcoords='offset points')
# Ajustar el diseño de los gráficos y mostrarlos
plt.tight_layout()
plt.show()
In [ ]:
# INGRESOS
total_por_ingresos = data_2021['QD13'].value_counts()
low_knowledge = data_2021[data_2021['Nota_Financial_Knowledge'] < 4]['QD13'].value_counts()
proporcion_por_ingresos = (low_knowledge / total_por_ingresos) * 100
# EDUCACION
low_knowledge1 = data_2021[data_2021['Nota_Financial_Knowledge'] < 4]['QD9_ISCED'].value_counts()
proporcion_por_educacion = (low_knowledge1 / total_por_educacion) * 100
# REGION
total_por_region = data_2021['QD2'].value_counts()
low_knowledge2 = data_2021[data_2021['Nota_Financial_Knowledge'] < 4]['QD2'].value_counts()
proporcion_por_region = (low_knowledge2 / total_por_region) * 100
In [ ]:
# FILTRAR DATASET POR Nota_Financial_Knowledge < 4
low_knowledge3 = data_2021[data_2021['Nota_Financial_Knowledge'] < 4]
# Análisis demográfico
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(30, 5))
sns.barplot(x= proporcion_por_ingresos.index, y=proporcion_por_ingresos, ax=axes[0])
sns.barplot(x= proporcion_por_educacion.index, y=proporcion_por_educacion, ax=axes[1])
sns.histplot(x= 'QD7', data=low_knowledge3, ax=axes[3])
sns.barplot(x= proporcion_por_region.index, y=proporcion_por_region, ax=axes[2])
axes[0].set_title('Distribución de ingresos')
axes[0].set_ylabel('Porcentaje')
axes[1].set_title('Nivel de Educación')
axes[1].set_ylabel('Porcentaje')
axes[3].set_title('Distribución de Edades')
axes[2].set_title('Distribución de Regiones')
axes[2].set_ylabel('Porcentaje')
# Anotar los valores en los gráficos
for ax in axes[0:3]: # Suponiendo que los primeros tres son gráficos de barras
for p in ax.patches:
ax.annotate(f'{p.get_height():.1f}%', (p.get_x() + p.get_width() / 2., p.get_height()),
ha='center', va='center', xytext=(0, 10), textcoords='offset points')
# Ajustar el diseño de los gráficos y mostrarlos
plt.tight_layout()
plt.show()
In [ ]:
#media de nota_financial_knoledge agrupada por regiones, nivel de ingresos, nivel de educacion y edad
nota_por_region = data_2021.groupby('QD2')['Nota_Financial_Knowledge'].mean()
nota_por_ingresos = data_2021.groupby('QD13')['Nota_Financial_Knowledge'].mean().sort_values(ascending=False)
nota_por_educacion = data_2021.groupby('QD9_ISCED')['Nota_Financial_Knowledge'].mean().sort_values(ascending=False)
nota_por_puesto_trabajo = data_2021.groupby('QD10')['Nota_Financial_Knowledge'].mean()
nota_por_rango_edad = data_2021.groupby('QD7_a')['Nota_Financial_Knowledge'].mean().sort_values(ascending=False)
nota_por_municipio = data_2021.groupby('QD1')['Nota_Financial_Knowledge'].mean().sort_values(ascending=False)
fig, axes = plt.subplots(2, 3, figsize=(25, 12)) # Cambia la disposición y el tamaño de la figura
# Gráfico 1: Nota de Conocimiento Financiero por Región
nota_por_region.plot(kind='bar', ax=axes[0,0])
axes[0,0].set_title('Nota de Conocimiento Financiero por Región')
axes[0,0].set_xlabel('Región')
axes[0,0].set_ylabel('Nota Media')
axes[0,0].set_xticklabels(['Andalucía', 'Aragón', 'Asturias', 'Baleares', 'Canarias', 'Cantabria', 'Castilla y León', 'Castilla-La Mancha', 'Cataluña', 'Valencia', 'Extremadura', 'Galicia', 'Madrid', 'Murcia', 'Navarra', 'País Vasco', 'La Rioja'], rotation=60)
# Gráfico 2: Nota de Conocimiento Financiero por Nivel de Ingresos
nota_por_ingresos.plot(kind='bar', ax=axes[0,1])
axes[0,1].set_title('Nota de Conocimiento Financiero por Nivel de Ingresos')
axes[0,1].set_xlabel('Nivel de Ingresos')
axes[0,1].set_ylabel('Nota Media')
#grafico 3: Nota de Conocimiento Financiero por rango de edad
nota_por_rango_edad.plot(kind='bar', ax=axes[0,2])
axes[0,2].set_title('Nota de Conocimiento Financiero por rango de edad')
axes[0,2].set_xlabel('rango de edad')
axes[0,2].set_ylabel('Nota Media')
# Gráfico 4: Nota de Conocimiento Financiero por Nivel de Educación
nota_por_educacion.plot(kind='bar', ax=axes[1,0])
axes[1,0].set_title('Nota de Conocimiento Financiero por Nivel de Educación')
axes[1,0].set_xlabel('Nivel de Educación ISCED')
axes[1,0].set_ylabel('Nota Media')
# Gráfico 5: Nota de Conocimiento Financiero por Tamaño de Municipio
nota_por_puesto_trabajo.plot(kind='bar', ax=axes[1,1])
axes[1,1].set_title('Nota de Conocimiento Financiero por puesto de trabajo')
axes[1,1].set_xlabel('puesto de trabajo')
axes[1,1].set_ylabel('Nota Media')
axes[1,1].set_xticklabels(['self employed', 'in paid employment', 'looking after home', 'looking for work', 'retired', 'unable to work due to sickness', 'not working and not looking for it', 'student', 'other'])
# Grafico 6: Nota de Conocimiento Financiero por tamaño de municipio
nota_por_municipio.plot(kind='bar', ax=axes[1,2])
axes[1,2].set_title('Nota de Conocimiento Financiero por genero')
axes[1,2].set_xlabel('tamaño de municipio')
axes[1,2].set_ylabel('Nota Media')
# Añadir anotaciones a todos los ejes
for i in range(2):
for j in range(3):
ax = axes[i,j]
for p in ax.patches:
ax.annotate(f'{p.get_height():.1f}', (p.get_x() + p.get_width() / 2., p.get_height()),
ha='center', va='center', xytext=(0, 10), textcoords='offset points')
# Ajustar automáticamente los parámetros de la subtrama para dar espacio al contenido
plt.tight_layout()
plt.show()
In [ ]:
#ver correlacion entre nota_financial_knowledge y las variables demograficas
correlacion = data_2021[['Nota_Financial_Knowledge', 'QD9_ISCED', 'QD7', 'QD2', 'QK1']].corr()
# Visualizar la matriz de correlación
plt.figure(figsize=(10, 6))
sns.heatmap(correlacion, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlación entre Nota de Conocimiento Financiero y Variables Demográficas')
plt.show()
In [ ]:
# Definir las respuestas correctas y las múltiples respuestas correctas donde sea necesario
respuestas_correctas = {
'QK4': 0, # Asumiendo que '0' es la respuesta correcta para QK4 (sin interés)
'QK5': 102, # Asumiendo que '102' es la respuesta correcta para QK5 (100 principal + 2% de interés)
'QK7_1': True, # Asumiendo que True es la respuesta correcta para QK7_1
'QK7_2': True,
'QK7_3': True
}
# Respuestas múltiples para QK3
data_encoded_2021['QK3_puntos'] = data_encoded_2021['QK3'].apply(lambda x: 1 if x in [3, 4] else 0)
# Calcular puntos por cada respuesta correcta, excepto QK6
for pregunta, correcta in respuestas_correctas.items():
data_encoded_2021[pregunta + '_puntos'] = (data_encoded_2021[pregunta] == correcta).astype(int)
# Asignar puntos para QK6 condicionado a QK5 siendo correcto también
data_encoded_2021['QK6_puntos'] = ((data_encoded_2021['QK6'] == 1) & (data_encoded_2021['QK5'] == 102)).astype(int)
# Sumar puntos para obtener la nota final de conocimiento financiero
columnas_puntos = [pregunta + '_puntos' for pregunta in respuestas_correctas] + ['QK3_puntos', 'QK6_puntos']
data_encoded_2021['Nota_Financial_Knowledge'] = data_encoded_2021[columnas_puntos].sum(axis=1)
# Visualizar el resultado
data_encoded_2021[['Nota_Financial_Knowledge'] + columnas_puntos].head()
Out[ ]:
| Nota_Financial_Knowledge | QK4_puntos | QK5_puntos | QK7_1_puntos | QK7_2_puntos | QK7_3_puntos | QK3_puntos | QK6_puntos | |
|---|---|---|---|---|---|---|---|---|
| ID | ||||||||
| 1 | 5 | 1 | 1 | 1 | 1 | 1 | 0 | 0 |
| 2 | 6 | 1 | 1 | 0 | 1 | 1 | 1 | 1 |
| 3 | 5 | 1 | 1 | 1 | 1 | 1 | 0 | 0 |
| 4 | 5 | 1 | 1 | 1 | 1 | 1 | 0 | 0 |
| 5 | 5 | 1 | 1 | 1 | 0 | 1 | 0 | 1 |
In [ ]:
# eliminar columnas de puntos de data_encoded_2021
data_encoded_2021 = data_encoded_2021.drop(columns=columnas_puntos)
#eliminar columan qd3
data_encoded_2021 = data_encoded_2021.drop(columns='QD3')
cluster analysis¶
In [ ]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Determinar el número óptimo de clusters usando el método del codo
sse = []
k_range = range(1, 15)
for k in k_range:
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(data_encoded_2021)
sse.append(kmeans.inertia_)
# Graficar el método del codo
plt.figure(figsize=(10, 6))
plt.plot(k_range, sse, marker='o')
plt.xlabel('Número de Clusters')
plt.ylabel('SSE')
plt.title('Método del Codo para Determinar el Número de Clusters')
plt.show()
In [ ]:
data_numeric = data_encoded_2021.select_dtypes(include=['int', 'float'])
# Escalar los datos
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_numeric) # Escalar los datos numéricos
# # Aplicar PCA
# pca = PCA(n_components=0.95) # Conservar el 95% de la varianza
# data_reduced = pca.fit_transform(data_scaled)
# Aplicar K-Means
kmeans = KMeans(n_clusters=4, random_state=42)
clusters = kmeans.fit_predict(data_scaled)
# Añadir los clusters al dataframe original
data_encoded_2021['Cluster'] = clusters
In [ ]:
# Comparar edad entre clusters
plt.figure(figsize=(10, 6))
sns.boxplot(x='Cluster', y='QD7', data=data_encoded_2021)
plt.title('Distribución de Edad por Cluster')
plt.show()
# Comparar ingresos entre clusters
plt.figure(figsize=(10, 6))
sns.countplot(x='QD2', hue='Cluster', data=data_encoded_2021)
plt.title('Distribución de region por Cluster')
plt.show()
In [ ]:
plt.figure(figsize=(12, 6))
sns.boxplot(x='Cluster', y='Nota_Financial_Knowledge', data=data_encoded_2021)
plt.title('Nota de Conocimiento Financiero por Cluster')
plt.show()
In [ ]:
# Por ejemplo, para el Cluster 0:
cluster_0 = data_encoded_2021[data_encoded_2021['Cluster'] == 0]
print(cluster_0.describe())
# Distribución de edades en el Cluster 0
plt.figure(figsize=(10, 5))
sns.histplot(cluster_0['QD7'], bins=20)
plt.title('Distribución de Edad en el Cluster 0')
plt.xlabel('Edad')
plt.ylabel('Frecuencia')
plt.show()
# Nivel de educación en el Cluster 0
plt.figure(figsize=(10,5))
sns.countplot(x='QD9_ISCED', data=cluster_0)
plt.title('Nivel de Educación en el Cluster 0')
plt.xlabel('Nivel de Educación ISCED')
plt.ylabel('Frecuencia')
plt.show()
# Nota de conocimiento financiero en el Cluster 0
plt.figure(figsize=(10,5))
sns.histplot(cluster_0['Nota_Financial_Knowledge'], bins=20)
plt.title('Nota de Conocimiento Financiero en el Cluster 0')
plt.xlabel('Nota de Conocimiento Financiero')
plt.ylabel('Frecuencia')
plt.show()
wght QD1 QD2 QD5_1 QD5_2 \
count 3267.000000 3267.000000 3267.000000 3267.000000 3267.000000
mean 4624.480430 0.499541 8.827977 0.117845 0.605448
std 3046.917124 0.500076 4.821725 0.322474 0.488829
min 432.794220 0.000000 1.000000 0.000000 0.000000
25% 2430.563600 0.000000 5.000000 0.000000 0.000000
50% 3820.336400 0.000000 9.000000 0.000000 1.000000
75% 6282.485600 1.000000 13.000000 0.000000 1.000000
max 19411.416000 1.000000 17.000000 1.000000 1.000000
QD5_3 QD5_4 QD5_5 QD5_6 QD5_7 ... \
count 3267.000000 3267.000000 3267.000000 3267.000000 3267.000000 ...
mean 0.213958 0.202326 0.265687 0.002143 0.003061 ...
std 0.410160 0.401796 0.441766 0.046246 0.055249 ...
min 0.000000 0.000000 0.000000 0.000000 0.000000 ...
25% 0.000000 0.000000 0.000000 0.000000 0.000000 ...
50% 0.000000 0.000000 0.000000 0.000000 0.000000 ...
75% 0.000000 0.000000 1.000000 0.000000 0.000000 ...
max 1.000000 1.000000 1.000000 1.000000 1.000000 ...
Product_Crypto-assets Product_Dont_know Product_Error \
count 3267.0 3267.000000 3267.000000
mean 0.0 0.001224 0.000306
std 0.0 0.034975 0.017495
min 0.0 0.000000 0.000000
25% 0.0 0.000000 0.000000
50% 0.0 0.000000 0.000000
75% 0.0 0.000000 0.000000
max 0.0 1.000000 1.000000
Product_Insurance Product_Not_applicable \
count 3267.000000 3267.000000
mean 0.000612 0.990511
std 0.024739 0.096962
min 0.000000 0.000000
25% 0.000000 1.000000
50% 0.000000 1.000000
75% 0.000000 1.000000
max 1.000000 1.000000
Product_Not_voluntarily_chosen Product_Refused_to_answer \
count 3267.000000 3267.000000
mean 0.006428 0.000306
std 0.079928 0.017495
min 0.000000 0.000000
25% 0.000000 0.000000
50% 0.000000 0.000000
75% 0.000000 0.000000
max 1.000000 1.000000
Product_Stocks and shares Nota_Financial_Knowledge Cluster
count 3267.000000 3267.000000 3267.0
mean 0.000306 4.523416 0.0
std 0.017495 1.494321 0.0
min 0.000000 0.000000 0.0
25% 0.000000 3.000000 0.0
50% 0.000000 4.000000 0.0
75% 0.000000 6.000000 0.0
max 1.000000 7.000000 0.0
[8 rows x 170 columns]
PCA¶
In [ ]:
#eliminar wght de data_encoded_2021
data_encoded_2021.drop('wght', axis=1, inplace=True)
In [ ]:
from sklearn.preprocessing import StandardScaler
# Normalizar las variables numéricas
scaler = StandardScaler()
data_normalized = scaler.fit_transform(data_encoded_2021)
In [ ]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# Aplicar PCA
pca = PCA(n_components=2) # Reducimos a 2 componentes principales para visualización
data_pca = pca.fit_transform(data_encoded_2021)
# Visualización de los componentes principales
plt.figure(figsize=(10, 6))
plt.scatter(data_pca[:, 0], data_pca[:, 1])
plt.xlabel('Componente Principal 1')
plt.ylabel('Componente Principal 2')
plt.title('PCA: Visualización de Componentes Principales')
plt.show()
# Explicación de la varianza
print("Varianza explicada por cada componente principal:", pca.explained_variance_ratio_)
Varianza explicada por cada componente principal: [0.41837088 0.25503746]
In [ ]:
# Ver las contribuciones de las variables originales a los componentes principales
pca_components = pd.DataFrame(pca.components_, columns=data_encoded_2021.columns)
#print(pca_components)
# Identificar las variables con las mayores contribuciones a PC1 y PC2
print(pca_components.iloc[0].sort_values(ascending=False).head(10)) # Para PC1
print(pca_components.iloc[1].sort_values(ascending=False).head(10)) # Para PC2
QF12_3_1 0.225902 QF12_6_2 0.225890 QF12_7_1 0.225890 QF12_3_2 0.225885 QF12_3_6 0.225883 QF12_3_3 0.225882 QF12_5_3 0.225882 QF12_3_7 0.225882 QF12_1_3 0.225877 QF12_4_1 0.225876 Name: 0, dtype: float64 QP7_98 0.170411 QF9_97 0.149936 Product_Not_applicable 0.124910 QD9 0.101324 Income_Below 15000€ 0.070012 QF12_98 0.056251 QP3_98 0.054912 QD10 0.053547 QF9_99 0.051622 Income_Do not know 0.032567 Name: 1, dtype: float64
arbol de decision para sacar variables relevantes y despues hacer una regresion ordinal¶
In [ ]:
# eliminar variables QK1 a QK7_3
data_encoded_2021.drop(['QK1', 'QK2', 'QK3', 'QK4', 'QK5', 'QK6', 'QK7_1', 'QK7_2', 'QK7_3', 'QD9'], axis=1, inplace=True)
In [ ]:
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
# Definir la variable dependiente y las variables independientes
X = data_encoded_2021.drop(columns=['Nota_Financial_Knowledge'])
y = data_encoded_2021['Nota_Financial_Knowledge']
# Crear y ajustar el modelo de árbol de decisión
tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(X, y)
# Obtener la importancia de las variables
importance = tree_model.feature_importances_
# Crear un dataframe para mostrar la importancia de las variables
features = pd.DataFrame({'Feature': X.columns, 'Importance': importance})
features = features.sort_values(by='Importance', ascending=False)
# Visualizar la importancia de las variables
plt.figure(figsize=(10, 30))
plt.barh(features['Feature'], features['Importance'])
plt.xlabel('Importancia')
plt.ylabel('Variable')
plt.title('Importancia de las Variables')
plt.gca().invert_yaxis()
plt.show()
In [ ]:
# Importar las libreria necesarias para el modelo de regresión ordinal
import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel
# Definir un umbral de importancia
importance_threshold = 0.01
# Filtrar las variables que superan el umbral de importancia
relevant_features = features[features['Importance'] > importance_threshold]['Feature']
print("Variables relevantes:", relevant_features)
# Crear un nuevo DataFrame solo con las variables relevantes
X_relevant = X[relevant_features]
# Ajustar nuevamente el modelo de regresión ordinal con las variables relevantes
model_relevant = OrderedModel(y, X_relevant, distr='logit')
result_relevant = model_relevant.fit(method='bfgs')
# Mostrar el resumen del nuevo modelo
result_relevant.summary()
Variables relevantes: 122 QD9_ISCED
0 QD1
121 QD7
93 QP3_16
1 QD2
109 QS1_2
92 QP3_13
110 QS1_3
113 QS1_7
25 QF8
111 QS1_4
114 QS1_8
78 QP2_12
123 QD10
119 QS3_9
112 QS1_5
120 QS3_11
79 QP2_13
57 QF13
117 QS2_3
115 QS1_10
71 QP2_2
108 QS1_1
12 QF1
10 QD5_ad
Name: Feature, dtype: object
Optimization terminated successfully.
Current function value: 1.577848
Iterations: 65
Function evaluations: 70
Gradient evaluations: 70
Out[ ]:
| Dep. Variable: | Nota_Financial_Knowledge | Log-Likelihood: | -12250. |
|---|---|---|---|
| Model: | OrderedModel | AIC: | 2.456e+04 |
| Method: | Maximum Likelihood | BIC: | 2.479e+04 |
| Date: | Thu, 16 May 2024 | ||
| Time: | 11:41:48 | ||
| No. Observations: | 7764 | ||
| Df Residuals: | 7732 | ||
| Df Model: | 25 |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| QD9_ISCED | 0.3684 | 0.013 | 28.732 | 0.000 | 0.343 | 0.394 |
| QD1 | 1.0341 | 0.043 | 23.996 | 0.000 | 0.950 | 1.119 |
| QD7 | 0.0097 | 0.002 | 6.419 | 0.000 | 0.007 | 0.013 |
| QP3_16 | 0.0068 | 0.001 | 9.301 | 0.000 | 0.005 | 0.008 |
| QD2 | 0.0176 | 0.004 | 4.115 | 0.000 | 0.009 | 0.026 |
| QS1_2 | 0.0062 | 0.002 | 3.259 | 0.001 | 0.002 | 0.010 |
| QP3_13 | -0.0058 | 0.006 | -0.939 | 0.348 | -0.018 | 0.006 |
| QS1_3 | -0.0044 | 0.005 | -0.876 | 0.381 | -0.014 | 0.005 |
| QS1_7 | 0.0077 | 0.003 | 2.574 | 0.010 | 0.002 | 0.014 |
| QF8 | 0.0065 | 0.004 | 1.655 | 0.098 | -0.001 | 0.014 |
| QS1_4 | -0.0014 | 0.006 | -0.244 | 0.807 | -0.013 | 0.010 |
| QS1_8 | 0.0044 | 0.002 | 1.848 | 0.065 | -0.000 | 0.009 |
| QP2_12 | 0.0044 | 0.001 | 4.869 | 0.000 | 0.003 | 0.006 |
| QD10 | -0.0083 | 0.010 | -0.871 | 0.384 | -0.027 | 0.010 |
| QS3_9 | 0.0108 | 0.006 | 1.819 | 0.069 | -0.001 | 0.022 |
| QS1_5 | 0.0019 | 0.004 | 0.504 | 0.614 | -0.005 | 0.009 |
| QS3_11 | 0.0165 | 0.004 | 3.749 | 0.000 | 0.008 | 0.025 |
| QP2_13 | 0.0126 | 0.006 | 2.064 | 0.039 | 0.001 | 0.025 |
| QF13 | 0.0034 | 0.001 | 2.433 | 0.015 | 0.001 | 0.006 |
| QS2_3 | -0.0061 | 0.006 | -0.946 | 0.344 | -0.019 | 0.007 |
| QS1_10 | 0.0026 | 0.005 | 0.468 | 0.640 | -0.008 | 0.013 |
| QP2_2 | 0.0029 | 0.001 | 3.469 | 0.001 | 0.001 | 0.004 |
| QS1_1 | 0.0058 | 0.004 | 1.640 | 0.101 | -0.001 | 0.013 |
| QF1 | -0.1851 | 0.039 | -4.747 | 0.000 | -0.262 | -0.109 |
| QD5_ad | 0.0022 | 0.001 | 3.101 | 0.002 | 0.001 | 0.004 |
| 0/1 | -5.7020 | 0.357 | -15.969 | 0.000 | -6.402 | -5.002 |
| 1/2 | 0.9241 | 0.125 | 7.370 | 0.000 | 0.678 | 1.170 |
| 2/3 | 0.6248 | 0.052 | 12.072 | 0.000 | 0.523 | 0.726 |
| 3/4 | 0.4726 | 0.028 | 16.659 | 0.000 | 0.417 | 0.528 |
| 4/5 | 0.3025 | 0.022 | 13.759 | 0.000 | 0.259 | 0.346 |
| 5/6 | 0.1519 | 0.022 | 6.809 | 0.000 | 0.108 | 0.196 |
| 6/7 | 0.2958 | 0.024 | 12.487 | 0.000 | 0.249 | 0.342 |
In [ ]:
Regresion logistica¶
In [ ]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
# Preparar los datos
X = data_encoded_2021.drop(columns=['Nota_Financial_Knowledge'])
y = data_encoded_2021['Nota_Financial_Knowledge']
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Crear el modelo de regresión logística multinomial
logistic_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
logistic_model.fit(X_train, y_train)
# Realizar predicciones
y_pred = logistic_model.predict(X_test)
# Evaluar el modelo
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[ 0 0 0 4 1 0 0 1]
[ 0 1 1 10 10 4 0 0]
[ 4 2 5 37 57 15 11 2]
[ 1 1 5 63 154 54 33 10]
[ 2 1 4 68 225 102 98 20]
[ 1 0 6 32 151 116 150 63]
[ 1 0 0 8 76 101 184 76]
[ 0 0 1 3 33 45 139 138]]
precision recall f1-score support
0 0.00 0.00 0.00 6
1 0.20 0.04 0.06 26
2 0.23 0.04 0.06 133
3 0.28 0.20 0.23 321
4 0.32 0.43 0.37 520
5 0.27 0.22 0.24 519
6 0.30 0.41 0.35 446
7 0.45 0.38 0.41 359
accuracy 0.31 2330
macro avg 0.25 0.22 0.22 2330
weighted avg 0.31 0.31 0.30 2330
/workspaces/TFM/.conda/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
In [ ]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
# Usar una muestra del conjunto de datos para acelerar el ajuste de hiperparámetros
X_train_sample, _, y_train_sample, _ = train_test_split(X_train, y_train, train_size=0.1, random_state=42)
# Definir los parámetros a ajustar (espacio reducido)
param_grid = {
'C': [0.1, 1, 10],
'solver': ['newton-cg', ] # saga soporta la regresión multinomial y es rápido
}
# Crear el modelo de regresión logística multinomial
logistic_model = LogisticRegression(multi_class='multinomial', max_iter=20000)
# Configurar GridSearchCV con StratifiedKFold y paralelización
cv = StratifiedKFold(n_splits=3)
grid_search = GridSearchCV(logistic_model, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_sample, y_train_sample)
# Mejor modelo encontrado
best_model = grid_search.best_estimator_
print("Mejores hiperparámetros:", grid_search.best_params_)
# Realizar predicciones con el mejor modelo
y_pred_best = best_model.predict(X_test)
# Evaluar el modelo
conf_matrix_best = confusion_matrix(y_test, y_pred_best)
class_report_best = classification_report(y_test, y_pred_best)
print("Matriz de Confusión del Mejor Modelo:")
print(conf_matrix_best)
print("\nInforme de Clasificación del Mejor Modelo:")
print(class_report_best)
Mejores hiperparámetros: {'C': 1, 'solver': 'newton-cg'}
Matriz de Confusión del Mejor Modelo:
[[ 0 0 0 4 1 0 0 1]
[ 0 3 3 9 8 3 0 0]
[ 0 3 10 38 45 17 13 7]
[ 0 12 34 76 95 47 46 11]
[ 0 12 30 80 158 104 95 41]
[ 0 5 24 48 122 123 121 76]
[ 0 2 9 23 68 101 144 99]
[ 0 1 6 15 35 53 128 121]]
Informe de Clasificación del Mejor Modelo:
precision recall f1-score support
0 0.00 0.00 0.00 6
1 0.08 0.12 0.09 26
2 0.09 0.08 0.08 133
3 0.26 0.24 0.25 321
4 0.30 0.30 0.30 520
5 0.27 0.24 0.25 519
6 0.26 0.32 0.29 446
7 0.34 0.34 0.34 359
accuracy 0.27 2330
macro avg 0.20 0.20 0.20 2330
weighted avg 0.27 0.27 0.27 2330
/workspaces/TFM/.conda/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1509: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/workspaces/TFM/.conda/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1509: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/workspaces/TFM/.conda/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1509: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
random forest¶
In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import pandas as pd
# Verificar el número de muestras en la clase minoritaria
class_counts = y_train.value_counts()
min_class_count = class_counts.min()
# Ajustar k_neighbors para SMOTE
k_neighbors = min(5, min_class_count - 1)
# Aplicar SMOTE para balancear las clases en el conjunto de entrenamiento
smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
# Verificar el balanceo
print("Distribución de clases antes de SMOTE:")
print(y_train.value_counts())
print("\nDistribución de clases después de SMOTE:")
print(pd.Series(y_train_balanced).value_counts())
Distribución de clases antes de SMOTE: Nota_Financial_Knowledge 4 1237 5 1172 6 1064 3 833 7 775 2 283 1 66 0 4 Name: count, dtype: int64 Distribución de clases después de SMOTE: Nota_Financial_Knowledge 4 1237 5 1237 2 1237 3 1237 7 1237 6 1237 1 1237 0 1237 Name: count, dtype: int64
In [ ]:
# Crear el modelo de Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_balanced, y_train_balanced)
Out[ ]:
RandomForestClassifier(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(random_state=42)
In [ ]:
# Realizar predicciones
y_pred_rf = rf_model.predict(X_test)
# Evaluar el modelo
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
class_report_rf = classification_report(y_test, y_pred_rf)
print("Matriz de Confusión del Modelo Random Forest:")
print(conf_matrix_rf)
print("\nInforme de Clasificación del Modelo Random Forest:")
print(class_report_rf)
Matriz de Confusión del Modelo Random Forest:
[[ 0 2 1 1 1 1 0 0]
[ 0 1 4 10 8 3 0 0]
[ 0 5 18 38 46 17 7 2]
[ 0 8 19 104 93 54 33 10]
[ 0 1 30 115 150 103 80 41]
[ 0 0 19 80 107 97 129 87]
[ 0 0 4 25 51 74 169 123]
[ 0 0 1 10 32 30 113 173]]
Informe de Clasificación del Modelo Random Forest:
precision recall f1-score support
0 0.00 0.00 0.00 6
1 0.06 0.04 0.05 26
2 0.19 0.14 0.16 133
3 0.27 0.32 0.30 321
4 0.31 0.29 0.30 520
5 0.26 0.19 0.22 519
6 0.32 0.38 0.35 446
7 0.40 0.48 0.44 359
accuracy 0.31 2330
macro avg 0.22 0.23 0.22 2330
weighted avg 0.30 0.31 0.30 2330
/workspaces/TFM/.conda/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1509: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/workspaces/TFM/.conda/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1509: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/workspaces/TFM/.conda/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1509: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns
# Obtener la importancia de las características
importances = rf_model.feature_importances_
feature_names = X.columns
feature_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
# Visualizar la importancia de las características
plt.figure(figsize=(12, 30))
sns.barplot(x='Importance', y='Feature', data=feature_importance)
plt.title('Importancia de las Variables en el Modelo Random Forest')
plt.show()
Gradient boosting¶
In [ ]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import pandas as pd
# Supongamos que data_encoded es tu DataFrame con las variables codificadas y 'Nota_Financial_Knowledge' es la variable dependiente
# Preparar los datos
X = data_encoded_2021.drop(columns=['Nota_Financial_Knowledge'])
y = data_encoded_2021['Nota_Financial_Knowledge']
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Ajustar k_neighbors para SMOTE
class_counts = y_train.value_counts()
min_class_count = class_counts.min()
k_neighbors = min(5, min_class_count - 1)
# Aplicar SMOTE para balancear las clases en el conjunto de entrenamiento
smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
# Verificar el balanceo
print("Distribución de clases antes de SMOTE:")
print(y_train.value_counts())
print("\nDistribución de clases después de SMOTE:")
print(pd.Series(y_train_balanced).value_counts())
Distribución de clases antes de SMOTE: Nota_Financial_Knowledge 4 1237 5 1172 6 1064 3 833 7 775 2 283 1 66 0 4 Name: count, dtype: int64 Distribución de clases después de SMOTE: Nota_Financial_Knowledge 4 1237 5 1237 2 1237 3 1237 7 1237 6 1237 1 1237 0 1237 Name: count, dtype: int64
In [ ]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
# Crear el modelo de Gradient Boosting
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
gb_model.fit(X_train, y_train)
Out[ ]:
GradientBoostingClassifier(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GradientBoostingClassifier(random_state=42)
In [ ]:
# Realizar predicciones
y_pred_gb = gb_model.predict(X_test)
# Evaluar el modelo
conf_matrix_gb = confusion_matrix(y_test, y_pred_gb)
class_report_gb = classification_report(y_test, y_pred_gb)
print("Matriz de Confusión del Modelo Gradient Boosting:")
print(conf_matrix_gb)
print("\nInforme de Clasificación del Modelo Gradient Boosting:")
print(class_report_gb)
Matriz de Confusión del Modelo Gradient Boosting:
[[ 0 0 0 3 3 0 0 0]
[ 0 2 0 8 12 4 0 0]
[ 0 2 7 52 48 12 11 1]
[ 0 2 9 80 143 48 32 7]
[ 0 1 4 97 209 113 82 14]
[ 0 3 7 53 138 118 148 52]
[ 0 0 1 10 84 90 177 84]
[ 0 1 1 4 31 53 120 149]]
Informe de Clasificación del Modelo Gradient Boosting:
precision recall f1-score support
0 0.00 0.00 0.00 6
1 0.18 0.08 0.11 26
2 0.24 0.05 0.09 133
3 0.26 0.25 0.25 321
4 0.31 0.40 0.35 520
5 0.27 0.23 0.25 519
6 0.31 0.40 0.35 446
7 0.49 0.42 0.45 359
accuracy 0.32 2330
macro avg 0.26 0.23 0.23 2330
weighted avg 0.32 0.32 0.31 2330
/workspaces/TFM/.conda/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1509: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/workspaces/TFM/.conda/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1509: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/workspaces/TFM/.conda/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1509: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
XGBOOST¶
In [ ]:
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix
# Preparar los datos para XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
# Parámetros de XGBoost
params = {
'objective': 'multi:softmax',
'num_class': len(set(y)), # Número de clases
'max_depth': 6,
'eta': 0.1,
'seed': 42
}
# Entrenar el modelo
xgb_model = xgb.train(params, dtrain, num_boost_round=100)
# Realizar predicciones
y_pred_xgb = xgb_model.predict(dtest)
# Evaluar el modelo
conf_matrix_xgb = confusion_matrix(y_test, y_pred_xgb)
class_report_xgb = classification_report(y_test, y_pred_xgb)
print("Matriz de Confusión del Modelo XGBoost:")
print(conf_matrix_xgb)
print("\nInforme de Clasificación del Modelo XGBoost:")
print(class_report_xgb)
Matriz de Confusión del Modelo XGBoost:
[[ 0 1 0 4 1 0 0 0]
[ 0 0 0 13 11 2 0 0]
[ 0 2 5 45 52 14 13 2]
[ 0 2 6 76 145 54 33 5]
[ 0 1 6 92 205 106 91 19]
[ 0 0 2 57 140 123 131 66]
[ 0 0 0 6 92 80 170 98]
[ 0 0 0 4 28 59 126 142]]
Informe de Clasificación del Modelo XGBoost:
precision recall f1-score support
0 0.00 0.00 0.00 6
1 0.00 0.00 0.00 26
2 0.26 0.04 0.07 133
3 0.26 0.24 0.25 321
4 0.30 0.39 0.34 520
5 0.28 0.24 0.26 519
6 0.30 0.38 0.34 446
7 0.43 0.40 0.41 359
accuracy 0.31 2330
macro avg 0.23 0.21 0.21 2330
weighted avg 0.30 0.31 0.30 2330
/workspaces/TFM/.conda/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1509: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/workspaces/TFM/.conda/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1509: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/workspaces/TFM/.conda/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1509: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
In [ ]:
RED NEURONAL¶
In [ ]:
# Importar las librerias necesarias
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, confusion_matrix
# Suponiendo que data_encoded es tu DataFrame con las variables codificadas y 'Nota_Financial_Knowledge' es la variable dependiente
# Preparar los datos
X = data_encoded_2021.drop(columns=['Nota_Financial_Knowledge'])
y = data_encoded_2021['Nota_Financial_Knowledge']
In [ ]:
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Escalar los datos
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Codificar las etiquetas en formato one-hot
encoder = OneHotEncoder(sparse_output=False)
y_train_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1))
y_test_encoded = encoder.transform(y_test.values.reshape(-1, 1))
In [ ]:
# Definir la arquitectura de la red neuronal
model = Sequential()
# Capa de entrada
model.add(Dense(128, input_dim=X_train_scaled.shape[1], activation='relu'))
# Capas ocultas
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5)) # Dropout para reducir el overfitting
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5)) # Dropout para reducir el overfitting
# Capa de salida
model.add(Dense(len(encoder.categories_[0]), activation='softmax'))
# Compilar el modelo
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
/workspaces/TFM/.conda/lib/python3.11/site-packages/keras/src/layers/core/dense.py:87: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead. super().__init__(activity_regularizer=activity_regularizer, **kwargs)
In [ ]:
# Entrenar el modelo
history = model.fit(X_train_scaled, y_train_encoded, epochs=50, batch_size=32, validation_data=(X_test_scaled, y_test_encoded))
Epoch 1/50
170/170 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.1611 - loss: 2.1815 - val_accuracy: 0.2850 - val_loss: 1.7193 Epoch 2/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.2574 - loss: 1.7763 - val_accuracy: 0.2931 - val_loss: 1.6440 Epoch 3/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.2711 - loss: 1.6866 - val_accuracy: 0.3052 - val_loss: 1.6311 Epoch 4/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.3083 - loss: 1.6297 - val_accuracy: 0.3099 - val_loss: 1.6127 Epoch 5/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.3000 - loss: 1.6183 - val_accuracy: 0.3223 - val_loss: 1.6079 Epoch 6/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.3211 - loss: 1.5917 - val_accuracy: 0.3034 - val_loss: 1.6225 Epoch 7/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.3215 - loss: 1.5607 - val_accuracy: 0.3163 - val_loss: 1.6145 Epoch 8/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.3272 - loss: 1.5727 - val_accuracy: 0.3103 - val_loss: 1.6186 Epoch 9/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.3235 - loss: 1.5448 - val_accuracy: 0.3193 - val_loss: 1.6172 Epoch 10/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.3378 - loss: 1.5329 - val_accuracy: 0.3064 - val_loss: 1.6287 Epoch 11/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.3438 - loss: 1.5246 - val_accuracy: 0.3047 - val_loss: 1.6278 Epoch 12/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.3371 - loss: 1.5300 - val_accuracy: 0.3026 - val_loss: 1.6352 Epoch 13/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.3399 - loss: 1.5211 - val_accuracy: 0.3112 - val_loss: 1.6326 Epoch 14/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - accuracy: 0.3467 - loss: 1.5116 - val_accuracy: 0.3073 - val_loss: 1.6340 Epoch 15/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.3457 - loss: 1.4991 - val_accuracy: 0.3150 - val_loss: 1.6518 Epoch 16/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.3550 - loss: 1.4771 - val_accuracy: 0.3107 - val_loss: 1.6405 Epoch 17/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.3640 - loss: 1.4794 - val_accuracy: 0.3004 - val_loss: 1.6596 Epoch 18/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.3642 - loss: 1.4743 - val_accuracy: 0.3120 - val_loss: 1.6846 Epoch 19/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.3538 - loss: 1.4664 - val_accuracy: 0.2961 - val_loss: 1.6948 Epoch 20/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.3692 - loss: 1.4479 - val_accuracy: 0.2974 - val_loss: 1.6933 Epoch 21/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.3612 - loss: 1.4405 - val_accuracy: 0.3052 - val_loss: 1.6981 Epoch 22/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.3722 - loss: 1.4279 - val_accuracy: 0.2845 - val_loss: 1.7076 Epoch 23/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.3749 - loss: 1.4321 - val_accuracy: 0.2966 - val_loss: 1.7117 Epoch 24/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.3724 - loss: 1.4182 - val_accuracy: 0.2987 - val_loss: 1.7300 Epoch 25/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.3821 - loss: 1.4121 - val_accuracy: 0.3043 - val_loss: 1.7472 Epoch 26/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.3874 - loss: 1.3947 - val_accuracy: 0.2828 - val_loss: 1.7378 Epoch 27/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.3780 - loss: 1.4052 - val_accuracy: 0.2871 - val_loss: 1.7780 Epoch 28/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.3651 - loss: 1.4157 - val_accuracy: 0.2918 - val_loss: 1.8060 Epoch 29/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.3880 - loss: 1.3776 - val_accuracy: 0.3013 - val_loss: 1.7783 Epoch 30/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - accuracy: 0.3889 - loss: 1.3703 - val_accuracy: 0.2979 - val_loss: 1.7827 Epoch 31/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.4031 - loss: 1.3627 - val_accuracy: 0.2897 - val_loss: 1.8407 Epoch 32/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.3868 - loss: 1.3774 - val_accuracy: 0.2923 - val_loss: 1.8641 Epoch 33/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.3966 - loss: 1.3574 - val_accuracy: 0.2833 - val_loss: 1.8933 Epoch 34/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.4035 - loss: 1.3512 - val_accuracy: 0.2863 - val_loss: 1.9011 Epoch 35/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.4128 - loss: 1.3378 - val_accuracy: 0.2880 - val_loss: 1.9666 Epoch 36/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.4372 - loss: 1.3167 - val_accuracy: 0.2918 - val_loss: 2.0194 Epoch 37/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.4298 - loss: 1.3233 - val_accuracy: 0.2820 - val_loss: 1.9680 Epoch 38/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.4270 - loss: 1.3143 - val_accuracy: 0.2880 - val_loss: 1.9863 Epoch 39/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.4150 - loss: 1.3290 - val_accuracy: 0.2833 - val_loss: 2.0480 Epoch 40/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.4342 - loss: 1.2871 - val_accuracy: 0.2794 - val_loss: 2.0627 Epoch 41/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.4175 - loss: 1.2974 - val_accuracy: 0.2867 - val_loss: 2.1331 Epoch 42/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.4248 - loss: 1.2847 - val_accuracy: 0.2760 - val_loss: 2.1523 Epoch 43/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.4340 - loss: 1.2974 - val_accuracy: 0.2824 - val_loss: 2.1376 Epoch 44/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.4433 - loss: 1.2649 - val_accuracy: 0.2815 - val_loss: 2.2964 Epoch 45/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - accuracy: 0.4455 - loss: 1.2590 - val_accuracy: 0.2790 - val_loss: 2.1692 Epoch 46/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.4420 - loss: 1.2673 - val_accuracy: 0.2773 - val_loss: 2.2392 Epoch 47/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.4487 - loss: 1.2620 - val_accuracy: 0.2803 - val_loss: 2.4333 Epoch 48/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.4415 - loss: 1.2660 - val_accuracy: 0.2837 - val_loss: 2.3534 Epoch 49/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.4685 - loss: 1.2076 - val_accuracy: 0.2781 - val_loss: 2.4696 Epoch 50/50 170/170 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.4658 - loss: 1.2329 - val_accuracy: 0.2712 - val_loss: 2.3706
In [ ]:
# Evaluar el modelo en el conjunto de prueba
loss, accuracy = model.evaluate(X_test_scaled, y_test_encoded)
print(f'Precisión del modelo en el conjunto de prueba: {accuracy * 100:.2f}%')
# Realizar predicciones
y_pred_prob = model.predict(X_test_scaled)
y_pred = np.argmax(y_pred_prob, axis=1)
# Convertir las etiquetas one-hot a etiquetas originales
y_test_original = np.argmax(y_test_encoded, axis=1)
# Evaluar el modelo
conf_matrix_nn = confusion_matrix(y_test_original, y_pred)
class_report_nn = classification_report(y_test_original, y_pred)
print("Matriz de Confusión del Modelo de Red Neuronal:")
print(conf_matrix_nn)
print("\nInforme de Clasificación del Modelo de Red Neuronal:")
print(class_report_nn)
73/73 ━━━━━━━━━━━━━━━━━━━━ 0s 637us/step - accuracy: 0.2734 - loss: 2.2322 Precisión del modelo en el conjunto de prueba: 27.12% 73/73 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step
73/73 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step Matriz de Confusión del Modelo de Red Neuronal: [[ 0 0 1 2 1 0 0 2] [ 0 0 1 12 7 6 0 0] [ 0 0 7 59 34 23 8 2] [ 1 2 12 106 97 74 24 5] [ 0 0 7 145 143 128 69 28] [ 0 0 4 85 116 128 132 54] [ 0 1 5 44 59 115 138 84] [ 0 0 0 16 31 72 130 110]] Informe de Clasificación del Modelo de Red Neuronal: precision recall f1-score support 0 0.00 0.00 0.00 6 1 0.00 0.00 0.00 26 2 0.19 0.05 0.08 133 3 0.23 0.33 0.27 321 4 0.29 0.28 0.28 520 5 0.23 0.25 0.24 519 6 0.28 0.31 0.29 446 7 0.39 0.31 0.34 359 accuracy 0.27 2330 macro avg 0.20 0.19 0.19 2330 weighted avg 0.27 0.27 0.27 2330
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: